# Set the default CRAN mirror so install.packages() never prompts for one
# while the document is being knitted.
options(repos = c(CRAN = "https://cloud.r-project.org"))

# Packages the analysis depends on.
packages <- c("timetk", "tidyverse", "lubridate", "forecast", "tseries")

# Install whatever is missing, then attach every package. Using require() as
# the install check meant a package that had to be installed was never
# attached afterwards; library() also stops with an error if a package still
# cannot be loaded instead of silently returning FALSE.
for (pkg in packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
## Loading required package: timetk
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: forecast
## 
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## 
## Loading required package: tseries

About Data Analysis Report

This RMarkdown file contains the report of the data analysis done for the project on forecasting daily bike rental demand using time series models in R. It contains analysis such as data exploration, summary statistics and building the time series models. The final report was completed on Sat May 31 08:56:08 2025.

Data Description:

This dataset contains the daily count of rental bike transactions in the Capital Bikeshare system during 2011 and 2012, together with the corresponding weather and seasonal information.

Data Source: https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset

Relevant Paper:

Fanaee-T, Hadi, and Gama, Joao, ‘Event labeling combining ensemble detectors and background knowledge’, Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg

Task One: Load and explore the data

Load data and install packages

## Import required packages
# Attach timetk directly. Calling install.packages() on a package that is
# already loaded does nothing except emit a "package is in use and will not
# be installed" warning.
library(timetk)

# Load the built-in dataset
data("bike_sharing_daily")

# Work with a shorter name for the rest of the analysis
bike_data <- bike_sharing_daily

Describe and explore the data

# Open the spreadsheet-style viewer only in an interactive session; when the
# document is knitted there is no viewer for View() to open.
if (interactive()) {
  View(bike_data)
}

# Check structure
str(bike_data)
## spc_tbl_ [731 × 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ instant   : num [1:731] 1 2 3 4 5 6 7 8 9 10 ...
##  $ dteday    : Date[1:731], format: "2011-01-01" "2011-01-02" ...
##  $ season    : num [1:731] 1 1 1 1 1 1 1 1 1 1 ...
##  $ yr        : num [1:731] 0 0 0 0 0 0 0 0 0 0 ...
##  $ mnth      : num [1:731] 1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday   : num [1:731] 0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday   : num [1:731] 6 0 1 2 3 4 5 6 0 1 ...
##  $ workingday: num [1:731] 0 0 1 1 1 1 1 0 0 1 ...
##  $ weathersit: num [1:731] 2 2 1 1 1 1 2 2 1 1 ...
##  $ temp      : num [1:731] 0.344 0.363 0.196 0.2 0.227 ...
##  $ atemp     : num [1:731] 0.364 0.354 0.189 0.212 0.229 ...
##  $ hum       : num [1:731] 0.806 0.696 0.437 0.59 0.437 ...
##  $ windspeed : num [1:731] 0.16 0.249 0.248 0.16 0.187 ...
##  $ casual    : num [1:731] 331 131 120 108 82 88 148 68 54 41 ...
##  $ registered: num [1:731] 654 670 1229 1454 1518 ...
##  $ cnt       : num [1:731] 985 801 1349 1562 1600 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   instant = col_double(),
##   ..   dteday = col_date(format = ""),
##   ..   season = col_double(),
##   ..   yr = col_double(),
##   ..   mnth = col_double(),
##   ..   holiday = col_double(),
##   ..   weekday = col_double(),
##   ..   workingday = col_double(),
##   ..   weathersit = col_double(),
##   ..   temp = col_double(),
##   ..   atemp = col_double(),
##   ..   hum = col_double(),
##   ..   windspeed = col_double(),
##   ..   casual = col_double(),
##   ..   registered = col_double(),
##   ..   cnt = col_double()
##   .. )
# Summary statistics for every column of the dataset
summary(bike_data)
##     instant          dteday               season            yr        
##  Min.   :  1.0   Min.   :2011-01-01   Min.   :1.000   Min.   :0.0000  
##  1st Qu.:183.5   1st Qu.:2011-07-02   1st Qu.:2.000   1st Qu.:0.0000  
##  Median :366.0   Median :2012-01-01   Median :3.000   Median :1.0000  
##  Mean   :366.0   Mean   :2012-01-01   Mean   :2.497   Mean   :0.5007  
##  3rd Qu.:548.5   3rd Qu.:2012-07-01   3rd Qu.:3.000   3rd Qu.:1.0000  
##  Max.   :731.0   Max.   :2012-12-31   Max.   :4.000   Max.   :1.0000  
##       mnth          holiday           weekday        workingday   
##  Min.   : 1.00   Min.   :0.00000   Min.   :0.000   Min.   :0.000  
##  1st Qu.: 4.00   1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.000  
##  Median : 7.00   Median :0.00000   Median :3.000   Median :1.000  
##  Mean   : 6.52   Mean   :0.02873   Mean   :2.997   Mean   :0.684  
##  3rd Qu.:10.00   3rd Qu.:0.00000   3rd Qu.:5.000   3rd Qu.:1.000  
##  Max.   :12.00   Max.   :1.00000   Max.   :6.000   Max.   :1.000  
##    weathersit         temp             atemp              hum        
##  Min.   :1.000   Min.   :0.05913   Min.   :0.07907   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:0.33708   1st Qu.:0.33784   1st Qu.:0.5200  
##  Median :1.000   Median :0.49833   Median :0.48673   Median :0.6267  
##  Mean   :1.395   Mean   :0.49538   Mean   :0.47435   Mean   :0.6279  
##  3rd Qu.:2.000   3rd Qu.:0.65542   3rd Qu.:0.60860   3rd Qu.:0.7302  
##  Max.   :3.000   Max.   :0.86167   Max.   :0.84090   Max.   :0.9725  
##    windspeed           casual         registered        cnt      
##  Min.   :0.02239   Min.   :   2.0   Min.   :  20   Min.   :  22  
##  1st Qu.:0.13495   1st Qu.: 315.5   1st Qu.:2497   1st Qu.:3152  
##  Median :0.18097   Median : 713.0   Median :3662   Median :4548  
##  Mean   :0.19049   Mean   : 848.2   Mean   :3656   Mean   :4504  
##  3rd Qu.:0.23321   3rd Qu.:1096.0   3rd Qu.:4776   3rd Qu.:5956  
##  Max.   :0.50746   Max.   :3410.0   Max.   :6946   Max.   :8714
# Names of the columns
names(bike_data)
##  [1] "instant"    "dteday"     "season"     "yr"         "mnth"      
##  [6] "holiday"    "weekday"    "workingday" "weathersit" "temp"      
## [11] "atemp"      "hum"        "windspeed"  "casual"     "registered"
## [16] "cnt"
# Row count followed by column count
c(nrow(bike_data), ncol(bike_data))
## [1] 731  16
# First and last date covered
range(bike_data[["dteday"]])
## [1] "2011-01-01" "2012-12-31"
# Distribution of the total rental count
summary(bike_data[["cnt"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      22    3152    4548    4504    5956    8714
# Average rentals by season: mean of cnt within each value of the season code
aggregate(cnt ~ season, data = bike_data, FUN = mean)
##   season      cnt
## 1      1 2604.133
## 2      2 4992.332
## 3      3 5644.303
## 4      4 4728.163
# Line chart of the total rental count against the date
with(
  bike_data,
  plot(dteday, cnt,
       type = "l",
       main = "Bike Rentals Over Time",
       xlab = "Date",
       ylab = "Total Rentals")
)

# How closely do the two temperature measures track total rentals?
with(bike_data, cor(temp, cnt))    # Normalized actual temp
## [1] 0.627494
with(bike_data, cor(atemp, cnt))   # Normalized feeling temp
## [1] 0.6310657
# Average temperature within each season code
aggregate(temp ~ season, data = bike_data, FUN = mean)
##   season      temp
## 1      1 0.2977475
## 2      2 0.5444052
## 3      3 0.7063093
## 4      4 0.4229060
# Middle (median) temperature within each season code
aggregate(temp ~ season, data = bike_data, FUN = median)
##   season      temp
## 1      1 0.2858330
## 2      2 0.5620835
## 3      3 0.7145830
## 4      4 0.4091665
# Two-digit month label taken from the date column
bike_data[["month"]] <- format(bike_data[["dteday"]], format = "%m")

# Mean temperature, humidity, wind speed and rental count for each month label
aggregate(
  cbind(temp, hum, windspeed, cnt) ~ month,
  data = bike_data,
  FUN = mean
)
##    month      temp       hum windspeed      cnt
## 1     01 0.2364439 0.5858283 0.2063028 2176.339
## 2     02 0.2992264 0.5674647 0.2156839 2655.298
## 3     03 0.3905388 0.5884750 0.2226994 3692.258
## 4     04 0.4699988 0.5880631 0.2344822 4484.900
## 5     05 0.5947984 0.6889583 0.1829889 5349.774
## 6     06 0.6840972 0.5758055 0.1854199 5772.367
## 7     07 0.7554704 0.5978763 0.1660588 5563.677
## 8     08 0.7085816 0.6377301 0.1729181 5664.419
## 9     09 0.6164850 0.7147144 0.1659451 5766.517
## 10    10 0.4850122 0.6937609 0.1752055 5199.226
## 11    11 0.3692198 0.6248765 0.1838014 4247.183
## 12    12 0.3240310 0.6660405 0.1766089 3403.806
# Temperature against casual and then registered rentals
with(bike_data, cor(temp, casual))
## [1] 0.5432847
with(bike_data, cor(temp, registered))
## [1] 0.540012
# Box plot of the temp column for each season code, one fill colour per box.
# NOTE(review): the season names in xlab (1=Winter ... 4=Fall) are not
# established anywhere in this file; verify the mapping against the dataset
# documentation before relying on it.
boxplot(temp ~ season, data = bike_data,
        main = "Temperature Distribution by Season",
        xlab = "Season (1=Winter, 2=Spring, 3=Summer, 4=Fall)",
        ylab = "Normalized Temperature",
        col = c("lightblue", "lightgreen", "lightpink", "lightyellow"))

Task Two: Create interactive time series plots

## Read about the timetk package
# ?timetk
# Calendar year of each observation
bike_data[["year"]] <- year(bike_data[["dteday"]])

# Interactive plot of total rentals over time, with a range slider
bike_data |>
  plot_time_series(
    .date_var = dteday,
    .value = cnt,
    .title = "Total Bike Rentals Over Time",
    .interactive = TRUE,
    .plotly_slider = TRUE
  )
# Same series, coloured by the year column
bike_data |>
  plot_time_series(
    .date_var = dteday,
    .value = cnt,
    .color_var = year,
    .title = "Total Bike Rentals by Year",
    .interactive = TRUE
  )
# Seasonal diagnostics of the rental count
bike_data |>
  plot_seasonal_diagnostics(
    .date_var = dteday,
    .value = cnt,
    .interactive = TRUE
  )
# Anomaly diagnostics of the rental count
bike_data |>
  plot_anomaly_diagnostics(
    .date_var = dteday,
    .value = cnt,
    .interactive = TRUE
  )
## frequency = 7 observations per 1 week
## trend = 92 observations per 3 months

Task Three: Smooth time series data

# Install the smoothing packages only when they are missing. Reinstalling on
# every run is slow, a package that is already loaded is refused with a
# "package is in use" warning, and overwriting an existing installation can
# fail with "Permission denied" when its DLL cannot be replaced.
if (!requireNamespace("forecast", quietly = TRUE)) {
  install.packages("forecast")
}
if (!requireNamespace("TTR", quietly = TRUE)) {
  install.packages("TTR")  # For SMA
}
library(forecast)
library(TTR)
# Sort chronologically so the series is built in date order
bike_data <- bike_data[order(bike_data[["dteday"]]), ]

# Total daily rentals as a ts object: frequency 365, starting at the first
# observation of 2011
bike_ts <- ts(data = bike_data[["cnt"]], start = c(2011, 1), frequency = 365)

# Cleaned copy of the series produced by tsclean()
clean_bike_ts <- tsclean(x = bike_ts)

# Raw series in yellow with the cleaned series drawn over it in blue
plot(bike_ts, main = "Original Bike Rentals Time Series", col = "yellow")
lines(clean_bike_ts, col = "blue")
legend(
  "topleft",
  legend = c("Original", "Cleaned"),
  col = c("yellow", "blue"),
  lty = 1
)

# Simple moving average over a window of 10 observations
bike_sma <- SMA(x = clean_bike_ts, n = 10)
plot(bike_sma, main = "Smoothed Time Series with 10-Day Moving Average", col = "navy")

# Holt-Winters fit with the beta and gamma components switched off
bike_hw <- HoltWinters(x = clean_bike_ts, beta = FALSE, gamma = FALSE)
plot(bike_hw, main = "Simple Exponential Smoothing (Holt-Winters)")

# Forecast 30 steps beyond the end of the series
bike_forecast <- forecast(object = bike_hw, h = 30)
plot(bike_forecast, main = "30-Day Forecast of Bike Rentals")

Task Four: Decompose and assess the stationarity of time series data

# Re-plot the moving average to look for trend and seasonality
plot(bike_sma, main = "Smoothed Moving Average (SMA) - Check for Trend/Seasonality", col = "darkgreen")

# Decompose the uncleaned series
decomposed_bike <- decompose(x = bike_ts)

# Show the components returned by decompose()
plot(decomposed_bike)

# Subtract the seasonal component from the series
seasonally_adjusted <- bike_ts - decomposed_bike[["seasonal"]]
plot(seasonally_adjusted, main = "Seasonally Adjusted Series")

# Autocorrelation and partial autocorrelation of the adjusted series
acf(x = seasonally_adjusted, main = "ACF - Seasonally Adjusted")

pacf(x = seasonally_adjusted, main = "PACF - Seasonally Adjusted")

# Attach tseries for adf.test(). Calling install.packages() on a package that
# is already loaded only emits a "package is in use and will not be
# installed" warning.
library(tseries)

# Augmented Dickey-Fuller test with "stationary" as the alternative hypothesis
adf.test(seasonally_adjusted, alternative = "stationary")
## Warning in adf.test(seasonally_adjusted, alternative = "stationary"): p-value
## smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  seasonally_adjusted
## Dickey-Fuller = -4.381, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
# First-order difference at lag 1 of the seasonally adjusted series
diff_bike <- diff(x = seasonally_adjusted, lag = 1, differences = 1)

# Differenced series over time
plot(diff_bike, main = "Differenced Series")

# Autocorrelation and partial autocorrelation after differencing
acf(x = diff_bike, main = "ACF - Differenced")

pacf(x = diff_bike, main = "PACF - Differenced")

# Repeat the Augmented Dickey-Fuller test on the differenced series
adf.test(x = diff_bike, alternative = "stationary")
## Warning in adf.test(diff_bike, alternative = "stationary"): p-value smaller
## than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff_bike
## Dickey-Fuller = -14.046, Lag order = 8, p-value = 0.01
## alternative hypothesis: stationary

Task Five: Fit and forecast time series data using ARIMA models

# Packages used for model fitting in this section
library("forecast")
library("tseries")

# Let auto.arima() pick the model for the seasonally adjusted series
auto_model <- auto.arima(y = seasonally_adjusted)
summary(object = auto_model)
## Series: seasonally_adjusted 
## ARIMA(0,1,3) 
## 
## Coefficients:
##           ma1      ma2      ma3
##       -0.6535  -0.1541  -0.0795
## s.e.   0.0366   0.0472   0.0428
## 
## sigma^2 = 802357:  log likelihood = -5997.29
## AIC=12002.58   AICc=12002.64   BIC=12020.95
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE     MASE
## Training set 24.05215 893.2898 480.9124 -4.765986 18.18084 0.207053
##                      ACF1
## Training set -0.002202218
# Manually specified ARIMA(2,1,1) on the seasonally adjusted series.
# forecast::Arima() is used instead of stats::arima() so the fit is the same
# kind of object auto.arima() returns. With stats::arima() the reported MASE
# was scaled differently from the auto.arima() fit (0.87 vs 0.21 at almost
# identical MAE), so the two summaries could not be compared directly.
manual_model <- Arima(seasonally_adjusted, order = c(2, 1, 1))
summary(manual_model)
## 
## Call:
## arima(x = seasonally_adjusted, order = c(2, 1, 1))
## 
## Coefficients:
##          ar1      ar2      ma1
##       0.2577  -0.0236  -0.9107
## s.e.  0.0424   0.0409   0.0213
## 
## sigma^2 estimated as 800467:  log likelihood = -5997.93,  aic = 12003.85
## 
## Training set error measures:
##                    ME     RMSE     MAE       MPE     MAPE      MASE
## Training set 22.51826 894.0758 480.959 -4.842786 18.14926 0.8698455
##                      ACF1
## Training set -0.003321643
# Residual diagnostics for the auto.arima() fit; prints a Ljung-Box test
checkresiduals(auto_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,3)
## Q* = 278.96, df = 143, p-value = 8.12e-11
## 
## Model df: 3.   Total lags used: 146
# Autocorrelation and partial autocorrelation of the auto.arima() residuals
acf(residuals(auto_model), main = "ACF of Auto ARIMA Residuals")

pacf(residuals(auto_model), main = "PACF of Auto ARIMA Residuals")

# Shapiro-Wilk normality test on the same residuals
shapiro.test(residuals(auto_model))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(auto_model)
## W = 0.78524, p-value < 2.2e-16
# Information criteria of the auto.arima() model
AIC(auto_model)
BIC(auto_model)
## [1] 12002.58
## [1] 12020.95
# Information criteria of the manually specified model
AIC(manual_model)
BIC(manual_model)
## [1] 12003.85
## [1] 12022.23
# 25-step-ahead forecast from the auto.arima() model
auto_forecast <- forecast(object = auto_model, h = 25)
plot(auto_forecast, main = "Forecast from Auto ARIMA")

# 25-step-ahead forecast from the manually specified model
manual_forecast <- forecast(object = manual_model, h = 25)
plot(manual_forecast, main = "Forecast from Manual ARIMA")

Task Six: Findings and Conclusions

📌 Conclusion

Throughout this project, I worked with the bike_sharing_daily dataset, which records the daily count of bike rentals in Washington, D.C., along with various weather and seasonal features. The goal was to perform a thorough time series analysis and forecasting using ARIMA models.

🔍 Key Learnings:

  • I learned how to explore and visualize time series data using tools like timetk, and how to draw insights from patterns such as seasonality and trends.
  • I applied techniques like decomposition and differencing to make the data stationary, a crucial step for advanced modeling.
  • I used both manual ARIMA and auto ARIMA methods to model and forecast bike rental demand.
  • I evaluated the models based on residual patterns, AIC/BIC values, and forecast performance.

✅ Key Findings:

  • Seasonality and Temperature: Bike rentals were clearly influenced by seasonality and temperature — higher rentals were observed in warmer months.
  • Strong Correlations: Normalized temperature and feeling temperature each showed a fairly strong positive correlation with the number of rentals (r ≈ 0.63 in both cases).
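  • Modeling Insights: Auto ARIMA selected ARIMA(0,1,3), which had slightly lower AIC and BIC than the manual ARIMA(2,1,1). However, the residuals did not pass the diagnostic checks: the Ljung-Box test (p = 8.12e-11) indicates remaining autocorrelation and the Shapiro-Wilk test (p < 2.2e-16) rejects normality, so the model leaves some structure unexplained and its forecast intervals should be read with caution.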
  • Forecast: The 25-day forecast provides a reasonable and interpretable estimate of future bike rental demand with confidence intervals.

📈 Takeaways:

  • Time series forecasting can be highly effective in understanding and predicting real-world patterns.
  • Cleaning the data and ensuring stationarity are essential for accurate ARIMA modeling.
  • Interactive visualizations and seasonal diagnostics are powerful tools for interpreting data behavior over time.

This project gave me hands-on experience with the full cycle of time series analysis — from data loading and cleaning, through modeling and diagnostics, to forecasting and interpretation.